import pandas as pd
import re
import copy
import csv
from rapidfuzz import process
from pathos.multiprocessing import Pool
def clean_name(name):
	"""Reduce a firm name to a compact key used for fuzzy matching.

	Periods are dropped, two common "A <state> CORP" style phrases and a
	fixed list of stop words (legal suffixes, generic industry terms and
	frequent misspellings of COMPANY) are removed, and finally every
	remaining non-word character is stripped.

	Matching is case-sensitive: only upper-case stop words are removed.
	Underscores survive because the final pass uses the regex class ``\\W``.

	name -- the raw firm name (a string).
	Returns the cleaned string, which may be empty.
	"""
	stop_words = [
		"INCORPORATED", "INC", "LLC", "LTD", "LP", "LIMITED", "CORPORATION", "CORP",
		"COMPANY", "COMAPNY", "OF", "THE", "A", "DE", "CO", "REPRESENTED", "BY",
		"TECHNOLOGIES", "ENTERPRISE",
		"MANUFACTURING", "LABORATORIES", "PHARMACEUTICALS", "INTERNATIONAL",
		"ASSOCIATES", "SYSTEMS", "ENGINEERING", "RESEARCH", "THERAPEUTICS",
		"ENVIRONMENTAL SOLUTIONS", "SOLUTIONS",
		"ACQUISITION", "COMMUNICATIONS", "TELECOMMUNICATIONS", "TECHNOLOGY",
		"ADVANCED", "INTERNATIONAL", "ASSOCIATES", "MEDICAL", "SEMICONDUCTOR",
		"NETWORK", "INTELLECTUAL PROPERTY",
		"CONPANY", "COMAPANY", "COPANY", "COMPAMY", "COMNPANY", "COMPANYH",
		"CCPMANY", "CONMPANY", "COMPANY", "COMPANHY", "COMPNAY", "COMPANYU",
		"COMANY", "COPANNY",
	]
	phrase_patterns = (
		# e.g. "A CORP OF NY"
		r"\bA\sCORP(ORATION)?\sOF\s[A-Z]+\b",
		# e.g. "A CALIFORNIA CORPORATION"
		r"\bA\s[A-Z]+\sCORP(ORATION)?\b",
	)

	# Only periods are dropped here, so "L.L.C" becomes "LLC" and "CORP."
	# becomes "CORP". Commas still act as word boundaries for the stop-word
	# pass and are removed by the final non-word strip.
	stripped = name.replace(".", "")
	for phrase in phrase_patterns:
		stripped = re.sub(phrase, "", stripped)
	# Remove each stop word wherever it stands alone between word boundaries.
	for stop_word in stop_words:
		stripped = re.sub(r"\b" + stop_word + r"\b", "", stripped)
	# Whatever non-word characters remain (spaces, commas, ampersands, ...) go last.
	return re.sub(r"\W", "", stripped)
def findMatches(clean_names):
	"""Fuzzy-match every cleaned name against the whole list, in parallel.

	clean_names -- list of cleaned firm names; used both as the queries and
		as the pool of candidates.

	Returns a dict mapping each cleaned name to the list of candidate names
	whose score is strictly greater than the threshold (90). At most 100
	candidates are requested per query.
	"""
	accuracy = 90

	def innerMatch(clean):
		res = process.extract(clean, clean_names, limit=100, score_cutoff=accuracy)
		# Each result is indexed as [0] = matched string, [1] = score.
		# score_cutoff alone would admit scores equal to the threshold; keep
		# only strictly better ones.
		return [clean, [match[0] for match in res if match[1] > accuracy]]

	pool = Pool()
	try:
		parResults = pool.map(innerMatch, clean_names)
	finally:
		# Release the workers even when a task raises, and wait for them to
		# exit so no worker processes are left behind.
		pool.close()
		pool.join()

	matches = {}
	for name, found in parResults:
		matches[name] = found
	print("all matched")
	return matches
def clean_par(all_matches):
	"""Parallel variant of the coalescing step: expand match lists to a fixed point.

	For every key whose match list does not have exactly one entry, the list
	is replaced by the union of itself and the match lists of each of its
	members. Passes are repeated until a pass changes nothing.

	all_matches -- dict mapping a cleaned name to a list of matching cleaned
		names; every listed name must itself be a key. Updated in place.

	Returns the same dict.

	The mapped function only computes and returns the merged list. The
	dictionary update and the "something changed" flag are handled here in
	the calling process, because assignments made inside a pool task are not
	visible to the caller.
	"""
	def cleanOne(biz):
		matches = all_matches[biz]
		if len(matches) == 1:
			return biz, None
		# Union of the direct matches and the matches of each of those matches.
		merged = set(matches)
		for match in matches:
			merged.update(all_matches[match])
		if len(merged) != len(matches):
			return biz, list(merged)
		return biz, None

	pool = Pool()
	try:
		repeat = True
		while repeat:
			repeat = False
			print("Repeating")
			# Snapshot the keys so the dict can be updated while the results
			# of this pass are being consumed.
			for biz, merged in pool.map(cleanOne, list(all_matches.keys())):
				if merged is not None:
					print("Coalescing for " + biz)
					repeat = True
					all_matches[biz] = merged
			print("looked at all matches")
	finally:
		# Close only once all passes are done; a closed pool cannot run
		# another map.
		pool.close()
		pool.join()
	return all_matches
def clean(all_matches):
	"""Expand every match list with the matches of its members until stable.

	all_matches -- dict mapping a name to a list of matching names; every
		listed name must itself be a key. Updated in place.

	Keys whose list has exactly one entry are left alone. For the others,
	the list is replaced by the de-duplicated union of the list and the
	lists of each of its members whenever that union is larger. Passes over
	the dict are repeated until one pass makes no replacement.

	Returns the same dict. The order inside replaced lists is unspecified.
	"""
	changed = True
	while changed:
		changed = False
		print("Repeating")
		for biz in all_matches:
			direct = all_matches[biz]
			if len(direct) == 1:
				continue
			# Pool the direct matches with the matches of each direct match.
			pooled = list(direct)
			for neighbour in direct:
				pooled.extend(all_matches[neighbour])
			unique = list(set(pooled))
			if len(unique) == len(direct):
				continue
			print("Coalescing for " + biz)
			changed = True
			all_matches[biz] = unique
		print("looked at all matches")
	return all_matches
def representative(all_matches, firm_clean_pairs):
	"""Assign each full firm name the representative of its match group.

	all_matches -- dict mapping a cleaned name to the list of cleaned names
		it was grouped with.
	firm_clean_pairs -- either an iterable of ``[full_name, cleaned_name]``
		pairs, or a dict mapping a cleaned name to a list of full names.

	The representative of a group is its alphabetically smallest cleaned
	name; a cleaned name with an empty group represents itself. The empty
	cleaned name is skipped. The lists in ``all_matches`` are not modified.

	Returns a dict mapping each full name to its representative.
	"""
	# Index the full names by cleaned name once, instead of scanning the
	# pair list for every group.
	if isinstance(firm_clean_pairs, dict):
		name_map = firm_clean_pairs
	else:
		name_map = {}
		for full_name, cleaned in firm_clean_pairs:
			name_map.setdefault(cleaned, []).append(full_name)

	final = {}
	count = 0
	for biz in all_matches:
		if biz == "":
			continue
		if count % 1000 == 0:
			print("Finding representative for " + str(count))
		count += 1
		reps = all_matches[biz]
		# min() gives the same choice as sorting and taking the first
		# element, without reordering the caller's list.
		rep_short = min(reps) if reps else biz
		for fullname in name_map.get(biz, ()):
			final[fullname] = rep_short
	return final
def write_to_file(rep):
	"""Write the name -> representative mapping to ``results.csv``.

	rep -- dict mapping a full firm name to its disambiguated name.

	The file is created in the current working directory (overwriting any
	existing one) with the header ``ee_name,ee_name_disambig`` followed by
	one row per entry.
	"""
	# newline="" lets the csv module control line endings itself; without it
	# rows end in "\r\r\n" on platforms that translate "\n" on write.
	with open("results.csv", "w", newline="") as csv_file:
		csv_writer = csv.writer(csv_file, delimiter=',', quotechar='"')
		csv_writer.writerow(["ee_name", "ee_name_disambig"])
		csv_writer.writerows(rep.items())
# def run():
# 	print("Loading data")
# 	fr = pd.read_csv('firm_for_disambig.csv',nrows=1000,error_bad_lines=False)
# 	print("Data loaded")
# 	firm_names = [val for val in fr['ee_name_raw']]
# 	#unique names
# 	firm_names = list(set(filter(lambda x: x != "", firm_names)))
# 	name_map = {}
# 	clean_names = []
# 	for x in firm_names:
# 		clean = clean_name(str(x))
# 		clean_names.append(clean)
# 		full_names = name_map.setdefault(clean, [])
# 		full_names.append(x)
# 		name_map[clean] = full_names
# 	print("Names cleaned")
# 	matches = findMatches(clean_names)
# 	all_matches = clean(copy.copy(matches))
# 	rep = representative(all_matches, name_map)
# 	print(len(set(rep.values())))
# 	# distinct = set(map(lambda x: x[1], rep.values()))))
# 	# print("Distinct: " + str(distinct))
# 	write_to_file(rep)

def run():
	"""Run the whole disambiguation pipeline.

	Reads the ``ee_name_raw`` column of ``firm_for_disambig.csv``, cleans
	each distinct name, fuzzy-matches the cleaned names, coalesces the match
	groups, picks a representative per group, prints the number of distinct
	representatives and writes the mapping with ``write_to_file``.
	"""
	print("Loading data")
	try:
		# Malformed rows are skipped with a warning.
		fr = pd.read_csv('firm_for_disambig.csv', on_bad_lines='warn')
	except TypeError:
		# Older pandas releases do not know on_bad_lines; use the keyword
		# they provide for the same purpose.
		fr = pd.read_csv('firm_for_disambig.csv', error_bad_lines=False)
	print("Data loaded")
	# Unique names. Missing cells are loaded as NaN rather than "", so they
	# have to be dropped explicitly or they would turn into the name "nan".
	firm_names = {str(val) for val in fr['ee_name_raw'].dropna()}
	firm_names.discard("")
	firm_clean_pairs = [[full_name, clean_name(full_name)] for full_name in firm_names]
	print("Names cleaned")
	clean_names = list({pair[1] for pair in firm_clean_pairs})
	matches = findMatches(clean_names)
	# Shallow copy: the coalescing step rebinds entries of the dict it is given.
	all_matches = clean(copy.copy(matches))
	rep = representative(all_matches, firm_clean_pairs)
	print(len(set(rep.values())))
	write_to_file(rep)
# Guard the entry point: worker processes may import this module, and must
# not start the whole pipeline again when they do.
if __name__ == "__main__":
	run()